Introduction

So, in Part 1 of the workshop, we worked through our raw data to make it fit for downstream analysis. This type of data is called as processed/clean data.


Loading packages:

library(tidyverse)

Loading our processed data from Part 1:

eNorm <- read.delim("data/eNorm.txt", sep = "\t")
eNorm <- eNorm %>% 
  column_to_rownames(var = "gene")

pDat <- read.delim("data/GSE157103_formatted_pDat.txt", sep = "\t")
pDat <- pDat %>% 
  column_to_rownames(var = "ID")

6.0 PCA

We’ll start with principle component analysis (PCA) - a dimensionality reduction method that accounts for sample variation while maximizing variance.

## transforming eNorm values to log2(x)+1
e_log2 <- log2(eNorm + 1)

## transposing our log(x+1) transposed data frame, so that the columns 
## become the rows, and the rows become columns. As we want to check 
## the variance driven by the genes, and not the samples, we transpose 
## the dataframe to have the rows as the samples, and the columns as the genes, 
## as the PCA function performs column-wise applications, not row-wise.
t_log2 <- as.data.frame(t(e_log2))


## As our data has already been normalized, we don't want to scale it further. 
## We do however, want to centre it - meaning standardizing the upper and lower 
## limits of the distribution of our values
pca <- prcomp(t_log2, scale = FALSE, center = TRUE)

# print(pca)
# summary(pca)
# screeplot(pca)

library("factoextra", quietly = TRUE)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
eig.val<-get_eigenvalue(pca)
head(eig.val)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  1685.5669        34.022321                    34.02232
## Dim.2  1013.4825        20.456635                    54.47896
## Dim.3   284.6378         5.745272                    60.22423
## Dim.4   266.1820         5.372750                    65.59698
## Dim.5   173.9535         3.511165                    69.10814
## Dim.6   125.3862         2.530857                    71.63900
## dataframe with all PCs, their variance, and cumulative variance of all PCs
summary <- data.frame(PC = 1:126, var_explained = (pca$sdev)^2 / sum((pca$sdev)^2), 
                      cumulative = cumsum(pca$sdev^2 / sum(pca$sdev^2))
                      )
summary <- summary %>% 
  mutate(cumulative_perc = cumulative*100)


## we only consider the first 30 PCs
summary <- summary[1:30,]

## different ways to represent the same data
summary %>%
  ggplot(aes(x = sort(as.factor(PC)), y = var_explained)) +
  geom_bar(stat = "identity", fill = "forest green") +
  geom_text(aes(label = round(var_explained, digits = 2), vjust = -0.8), size = 2) +
  theme_minimal() +
  labs(title = "Variance Explained by each PC") 

summary %>%
  ggplot(aes(x = sort(as.factor(PC)), y = var_explained))+
  geom_point(colour = "forest green") +
  geom_line(group = "PC", colour = "forest green") +
  theme_minimal() +
  labs(title = "Variance Explained by each PC")

## or easily used by calling function in the "factoextra" package
fviz_eig(pca, col.var="blue")

summary %>%
  ggplot(aes(x = sort(as.factor(PC)), y = cumulative_perc))+
  geom_point(colour = "forest green") +
  geom_line(group = "PC", colour = "forest green") +
  theme_minimal() +
  labs(title = "Cumulative Proportion of Variation") 

## separating the PCA values into its own separate df
scores <- as.data.frame(pca$x)

## take the first 30
scores <- scores[c(1:30)]
head(scores)
##           PC1        PC2        PC3        PC4        PC5        PC6        PC7
## C1  -2.873577  31.513465   4.581241 -34.747719  15.702070 -29.361896   8.508419
## C2   1.069681  21.246050 -10.968403   8.339987  -3.260639 -17.241597  -3.523015
## C3 -21.265732 -16.398176  11.405154 -49.406015 -10.698952  -6.128537  -1.064602
## C4   8.360660  48.599969   9.782527  12.568988   2.640562  -2.135567  -3.014367
## C5 -52.264090   7.886006   7.748767 -25.925634   9.453192  15.599042  -6.826804
## C6  13.613215  29.320890  -6.705883  15.198558  -7.176141   1.188698 -10.093523
##           PC8        PC9       PC10       PC11       PC12        PC13      PC14
## C1 -10.146967 -1.7393485   8.528270 -2.4318774  1.7644941  -5.1124617  1.642345
## C2  -3.660366 -0.8253942  -1.797925 -5.2304072 -0.7391959 -15.2686948 -2.139551
## C3  -5.071375 18.4334570  14.031947 -3.2970120 -6.8373987  -0.4272142  1.500038
## C4  -2.971492 -5.1869985   1.106609  0.6095576 -5.0401862  -6.4507389  2.681354
## C5   2.492451 -2.4695017 -19.801383 -5.0695824 -6.2003436  -2.4090045 11.883342
## C6  -3.208975  2.7844656  -6.127613 -1.0578476  0.4531912 -10.9567302  3.506287
##         PC15      PC16      PC17       PC18      PC19      PC20       PC21
## C1 -8.875898  9.958008 -2.257577  0.0760982 5.0013025 -3.701882  11.100054
## C2 -1.673642 -3.371377  3.338661 -4.5877960 4.8236714 -2.603036   4.839628
## C3  1.668607  8.304741 -5.059460  6.8785575 9.3845361  3.353635   5.716499
## C4 -9.781740  3.395398  2.955598  3.5514763 4.7594520  0.723566   2.178126
## C5  3.812600 10.180299 19.058720 -4.1820518 1.2522601  6.604281 -12.063440
## C6  1.709447 -2.951148  2.932438 -2.6384673 0.2379606 -2.517254   5.712442
##          PC22       PC23       PC24      PC25       PC26       PC27      PC28
## C1 -1.1830862 -4.3701516 -6.0269453  2.487731 -3.9176775  7.2084391 -1.931781
## C2 -0.1437366 -4.0622984  0.6567662  1.469711 -3.0097441  1.4143064 -1.387003
## C3  7.6056574  1.1696635 -1.0229573  1.078533  1.9816533 -0.2157957  1.227088
## C4 -5.2577541  8.6100230  1.0761861 -0.495693  0.2191656 -0.8076055  2.522992
## C5  4.2031459  4.3432118  3.7112434  3.547634  3.0213863  4.0804927 -4.757769
## C6  2.2821878 -0.9523062 -1.6960234  2.700263  3.7529861  2.8236221 -2.476955
##          PC29        PC30
## C1  7.1364450 -1.27354936
## C2 -2.0213152 -1.19972570
## C3  7.3628155 12.38335042
## C4 -3.9418235 -0.03496177
## C5  0.7855329  0.31440811
## C6  0.1467464  0.12057683
## making a metadata data.frame containing all sample information data
mDat <- cbind(pDat, scores)

Now that we have our PC scores, we’ll estimate which of our variables are the ones driving that variation in our data

var <- get_pca_var(pca)
var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
library("corrplot", quietly = TRUE)
## corrplot 0.92 loaded
## contribution: first 20 genes, first 10 PCs
corrplot(var$contrib[1:20, 1:10], is.corr=FALSE)

## Cos2 is called square cosine (squared coordinates) and corresponds 
## to the quality of representation of variables
corrplot(var$cos2[1:20, 1:10], is.corr=FALSE)

## NOT RUN
# fviz_pca_var(pca,
#              col.var = "cos2", # Color by the quality of representation
#              gradient.cols = c("darkorchid4", "gold", "darkorange"),
#              repel = TRUE
#              )

## top10 contributions of variables to PC1
a<-fviz_contrib(pca, choice = "var", axes = 1, top=10)
## top 10 contributions of variables to PC2
b<-fviz_contrib(pca, choice = "var", axes = 2, top=10)
library("gridExtra", quietly = TRUE)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(a,b, ncol=2, top='Top10 contribution of the variables to the first two PCs')

## The results, for individuals (athletes) will be extracted using the 
## function get_pca_ind(). Similarly to variables, it provides a list 
## of matrices containing all the results for the individuals (coordinates, 
## correlation between individuals and axes, squared cosine, and contributions).
ind <- get_pca_ind(pca)
ind
## Principal Component Analysis Results for individuals
##  ===================================================
##   Name       Description                       
## 1 "$coord"   "Coordinates for the individuals" 
## 2 "$cos2"    "Cos2 for the individuals"        
## 3 "$contrib" "contributions of the individuals"
fviz_pca_ind(pca,
             col.ind = "cos2", # Color by the quality of representation
             gradient.cols = c("darkorchid4", "gold", "darkorange"),
             repel = TRUE
             )
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Total contribution on PC1 and PC2
fviz_contrib(pca, choice = "ind", axes = 1:2)

We’ll now plot the first 2 PCs with the variables that seem to be contributing to the most variance in the data.

mDat %>% 
  ggplot(aes(x = PC1, y = PC2, colour = COVID)) +
  geom_point(size = 3) +
  coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
  labs( x = "Principle Component 1", y = "Principle Component 2", title = "COVID: PC1 vs PC2") +
  scale_colour_manual(values = c("grey", "orange")) +
  theme_minimal() 

mDat %>% 
  ggplot(aes(x = PC1, y = PC2, colour = ICU)) +
  geom_point(size = 3) +
  coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
  labs( x = "Principle Component 1", y = "Principle Component 2", title = "ICU: PC1 vs PC2") +
  scale_colour_manual(values = c("grey", "blue")) +
  theme_minimal() 

mDat %>% 
  ggplot(aes(x = PC1, y = PC2, colour = Mechanical_Ventilation)) +
  geom_point(size = 3) +
  coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
  labs( x = "Principle Component 1", y = "Principle Component 2", title = "Mechanical Ventilation: PC1 vs PC2") +
  scale_colour_manual(values = c("grey", "purple")) +
  theme_minimal()

mDat %>% 
  # mutate(AP_score = case_when(
  #   APACHEII_Score <= 10 ~ "less_than_10",
  #   between(APACHEII_Score, 11, 20) ~ "eleven_to_20",
  #   between(APACHEII_Score, 21, 30) ~ "twentyone_to_30",
  #   between(APACHEII_Score, 31, 40) ~ "thirtyone_to_40",
  #   APACHEII_Score > 40 ~ "more_than_40")) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = APACHEII_Score)) +
  geom_point(size = 3) +
  coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
  labs( x = "Principle Component 1", y = "Principle Component 2", title = "APACHEII_Score", subtitle = "Score of disease-severity measured upon admittance to ICU") +
  theme_minimal() 

## Compare PC2 and PC3

mDat %>% 
  ggplot(aes(x = PC1, y = PC2, colour = ICU)) +
  geom_point(size = 3) +
  coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
  labs( x = "Principle Component 1", y = "Principle Component 2", title = "ICU: PC1 vs PC2") +
  scale_colour_manual(values = c("grey", "blue")) +
  theme_minimal() 

mDat %>% 
  ggplot(aes(x = PC2, y = PC3, colour = ICU)) +
  geom_point(size = 3) +
  coord_cartesian(ylim = c(-100, 100), xlim = c(-100, 100)) +
  labs( x = "Principle Component 2", y = "Principle Component 3", title = "ICU: PC2 vs PC3") +
  scale_colour_manual(values = c("grey", "blue")) +
  theme_minimal() 

7.0 Differential Expression Analysis

Okay, now moving into DE analysis: we’re going to use the limma package, rather than the more popular DESeq2 or edgeR packages. There’s broadly 3 steps to pulling out DE genes:

  1. Specifying your variables of interest to generate a model in the form of a matrix
  2. Fitting our data to that model
  3. Applying Bayesian statistics to the results of our model
library(limma)
## step 1 
mm_covid <- model.matrix(~COVID, pDat) 
## always better to use an intercept, as the starting value is not forced to zero

all(rownames(pDat) == colnames(eNorm))
## [1] TRUE
## step 2
efit_COVID <- lmFit(eNorm, mm_covid)
## step 3
efit_COVID <- efit_COVID %>% 
  eBayes()


topTable(efit_COVID, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "p")
##                logFC    AveExpr         t      P.Value    adj.P.Val        B
## GBGT1     -14.346965  13.119938 -8.608852 2.704926e-14 3.208042e-10 21.74699
## MCM6       13.006445  20.821294  8.119125 3.863965e-13 1.759204e-09 19.24480
## SLC12A9   -95.036340 100.648629 -8.092907 4.449926e-13 1.759204e-09 19.11189
## SLC15A3   -59.232204  68.814556 -7.998129 7.405414e-13 2.195705e-09 18.63245
## MAPK8IP3  -18.433847  29.398845 -7.874219 1.437384e-12 3.409474e-09 18.00811
## CDC7        3.365151   5.656738  7.711515 3.417089e-12 6.754446e-09 17.19281
## AIF1     -323.686454 472.519567 -7.566778 7.346232e-12 1.244662e-08 16.47216
## CLSPN       2.952676   4.133007  7.437656 1.448122e-11 2.146841e-08 15.83317
## PITPNM1   -16.397367  24.699942 -7.409046 1.682170e-11 2.216727e-08 15.69211
## TWF2      -84.289346 100.564424 -7.327147 2.580083e-11 3.059979e-08 15.28940
topTable(efit_COVID, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "logFC")
##               logFC   AveExpr         t      P.Value    adj.P.Val           B
## S100A9   -10554.113 22372.842 -2.567381 1.142823e-02 2.912609e-02 -3.14921169
## HBA2      -4627.381  2670.516 -5.072714 1.385177e-06 3.487942e-05  5.06699674
## ACTB      -4416.516  7242.785 -6.023213 1.782930e-08 1.639190e-06  9.14052918
## FTL       -4187.909  6897.358 -5.544648 1.679337e-07 7.572980e-06  7.03827981
## IFITM2    -2848.844  5357.986 -3.659910 3.710775e-04 1.980639e-03 -0.09398016
## RNA28SN1  -2740.657  2785.014 -5.705683 7.982667e-08 4.733722e-06  7.73475274
## RNA28SN4  -2351.735  1437.686 -7.147384 6.556764e-11 4.860201e-08 14.41135701
## EEF1A1    -2146.417  8071.037 -3.468840 7.182910e-04 3.236676e-03 -0.69295220
## FCGR3B     2140.546  6069.773  2.383893 1.863926e-02 4.307319e-02 -3.56958200
## HBA1      -2103.973  1667.291 -3.847665 1.893569e-04 1.225359e-03  0.51944745
## google GBGT1, S100A9 and COVID - what do we find?

We know from our PCA that age doesn’t seem to contribute to the variation observed. Let’s check whether controlling for age in our model changes the results we obtained

## We'll first get some statistics on the quality of our model with including age

## logistic requires categorical to be either yes or no (1 or 0)
model1 <- glm(as.factor(COVID) ~ Age, data = pDat, family = binomial)
summary(model1)
## 
## Call:
## glm(formula = as.factor(COVID) ~ Age, family = binomial, data = pDat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9352   0.5739   0.6543   0.7122   0.7818  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  2.08533    0.88314   2.361   0.0182 *
## Age         -0.01187    0.01353  -0.877   0.3804  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 128.29  on 125  degrees of freedom
## Residual deviance: 127.49  on 124  degrees of freedom
## AIC: 131.49
## 
## Number of Fisher Scoring iterations: 4
## Here the summary shows that age does not seem to strongly correlate 
## with COVID status, and so hence we would not expect a major change 
## in our results on including it in our model. However, just to test 
## that, let's add it to out model and check the resutls.

mm_age <- model.matrix(~COVID + Age, pDat) 

efit_age <- lmFit(eNorm, mm_age) %>% 
  eBayes()

topTable(efit_age, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "p")
##                logFC    AveExpr         t      P.Value    adj.P.Val        B
## GBGT1     -14.046860  13.119938 -8.550910 3.893914e-14 4.618182e-10 21.40403
## MCM6       12.757246  20.821294  8.034319 6.344663e-13 3.003183e-09 18.77807
## SLC12A9   -93.350898 100.648629 -8.000681 7.596584e-13 3.003183e-09 18.60857
## SLC15A3   -58.215586  68.814556 -7.902345 1.284422e-12 3.808311e-09 18.11420
## MAPK8IP3  -18.489458  29.398845 -7.844907 1.743990e-12 4.136744e-09 17.82627
## CDC7        3.324990   5.656738  7.608243 6.104652e-12 1.206686e-08 16.64678
## AIF1     -321.225164 472.519567 -7.472091 1.247953e-11 2.114389e-08 15.97360
## PITPNM1   -16.437456  24.699942 -7.376130 2.060166e-11 2.972291e-08 15.50168
## TTC9      -24.915019  19.432974 -7.350907 2.349422e-11 2.972291e-08 15.37799
## CLSPN       2.923208   4.133007  7.338497 2.506148e-11 2.972291e-08 15.31720
topTable(efit_age, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "logFC")
##               logFC   AveExpr         t      P.Value    adj.P.Val          B
## S100A9   -10160.956 22372.842 -2.468919 1.491768e-02 3.595278e-02 -3.3784598
## HBA2      -4648.774  2670.516 -5.061858 1.465260e-06 3.794319e-05  5.0153718
## ACTB      -4316.973  7242.785 -5.916899 3.003894e-08 2.698953e-06  8.6518054
## FTL       -4089.673  6897.358 -5.436198 2.784468e-07 1.182517e-05  6.5659556
## RNA28SN1  -2764.601  2785.014 -5.723978 7.430527e-08 5.123607e-06  7.8026755
## IFITM2    -2700.867  5357.986 -3.528938 5.866230e-04 2.833951e-03 -0.5090658
## RNA28SN4  -2357.059  1437.686 -7.114010 8.004557e-11 6.328936e-08 14.2240520
## EEF1A1    -2349.765  8071.037 -4.066164 8.445041e-05 6.874275e-04  1.2600294
## FCGR3B     2193.313  6069.773  2.430801 1.650034e-02 3.894029e-02 -3.4649414
## HBA1      -2114.040  1667.291 -3.839525 1.957299e-04 1.271976e-03  0.4898601
## We see that when arranged by logFC and by adjusted pvalue our model with 
## and without age shows the same ordering of the genes.

We saw that lactate concentration was contributing to PC2. Let’s check if we should be adjusting for this variable.

mm_lactate <- model.matrix(~COVID + Lactate_mmol.l , pDat) 
mm_lactate_df <- as.data.frame(mm_lactate) 

lactate_logres <- glm(COVIDyes ~ Lactate_mmol.l, data = mm_lactate_df, family = binomial)
summary(lactate_logres)
## 
## Call:
## glm(formula = COVIDyes ~ Lactate_mmol.l, family = binomial, data = mm_lactate_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1839   0.4396   0.4876   0.6916   1.3768  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      2.2880     0.3943   5.803  6.5e-09 ***
## Lactate_mmol.l  -0.8370     0.2641  -3.170  0.00153 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 128.29  on 125  degrees of freedom
## Residual deviance: 114.03  on 124  degrees of freedom
## AIC: 118.03
## 
## Number of Fisher Scoring iterations: 5
## The summary shows that lactate indeed does seem to be significantly associated 
## with COVID status. Let's visualise that
mm_lactate_df %>%
  ggplot(aes(x = Lactate_mmol.l, y = COVIDyes)) +
  geom_point(alpha = 0.2, colour = "orange") +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), colour = "orange") +
  theme_minimal() +
  labs(title = "Does lactate concentration inform of COVID status?", x = "Lactate (mmol/l)", y = "Probability of COVID-positive status")
## `geom_smooth()` using formula 'y ~ x'

## so now we know that there is a significant association with lactate levels and 
## the probability of having COVID. Let's add lactate to our linear model
efit_lactate <- lmFit(eNorm, mm_lactate) %>% 
  eBayes()

topTable(efit_lactate, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "p")
##                 logFC     AveExpr         t      P.Value    adj.P.Val        B
## GBGT1      -14.100701   13.119938 -7.922312 1.154622e-12 1.369382e-08 18.16473
## MCM6        13.263355   20.821294  7.753202 2.837841e-12 1.682840e-08 17.32187
## CDC7         3.529122    5.656738  7.600076 6.372660e-12 1.698344e-08 16.56355
## PELI3       -3.847807    4.537970 -7.598845 6.414097e-12 1.698344e-08 16.55747
## MAPK8IP3   -18.933196   29.398845 -7.577943 7.159967e-12 1.698344e-08 16.45435
## SLC15A3    -59.226784   68.814556 -7.483197 1.177384e-11 2.327296e-08 15.98809
## HAAO        -3.877558    3.365732 -7.443598 1.448515e-11 2.454199e-08 15.79380
## NAPSA      -29.225402   24.012864 -7.415859 1.674445e-11 2.482365e-08 15.65792
## METRNL     -18.080742   15.634706 -7.384769 1.969362e-11 2.595182e-08 15.50584
## RNA18SN2 -1933.844371 1965.367916 -7.305644 2.972739e-11 2.938057e-08 15.11982
topTable(efit_lactate, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "logFC")
##              logFC   AveExpr         t      P.Value    adj.P.Val         B
## HBA2     -5206.811  2670.516 -5.407114 3.175498e-07 1.226756e-05  6.440438
## FTL      -4358.981  6897.358 -5.408454 3.156355e-07 1.226756e-05  6.446072
## ACTB     -4327.541  7242.785 -5.524856 1.860821e-07 8.827734e-06  6.938547
## B2M       3371.789 16010.351  2.430849 1.649823e-02 3.649595e-02 -3.434133
## RNA28SN1 -3152.997  2785.014 -6.283311 5.161600e-09 8.745225e-07 10.288178
## EEF1A1   -2603.769  8071.037 -4.003939 1.067173e-04 7.237743e-04  1.061398
## TXNIP     2533.727  6352.420  2.621531 9.852509e-03 2.406813e-02 -2.990803
## FCGR3B    2494.685  6069.773  2.611941 1.011797e-02 2.458496e-02 -3.013814
## RNA28SN4 -2483.527  1437.686 -7.097047 8.733519e-11 6.092914e-08 14.109575
## HBA1     -2331.664  1667.291 -4.010926 1.039621e-04 7.102482e-04  1.085312

Let’s compare the expression of GBGT1 and HBA2 between COVID positive and negative patients.

pDat <- pDat %>% 
  rownames_to_column(var = "sample")

covid <- pDat %>% 
  dplyr::select(sample, COVID)

GBGT1 <- eNorm %>%
  rownames_to_column(var = "gene") %>%
  filter(gene == "GBGT1") %>%
  column_to_rownames(var = "gene")

GBGT1 <- as.data.frame(t(GBGT1))

GBGT1 <- GBGT1 %>%
  rownames_to_column(var = "sample")

GBGT1 <- GBGT1 %>%
  left_join(covid, by = "sample")

GBGT1 %>%
  ggplot(aes(x = COVID, y = log2(GBGT1), fill = COVID)) +
  geom_violin() +
  scale_fill_manual(values = c("gray", "orange")) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(x = "COVID Status", y = "log2 (GBGT1 RPM)", title = "GBGT1: Gene with lowest adjusted p-value with or without accounting for lactate")

HBA2 <- eNorm %>% 
  rownames_to_column(var = "gene") %>% 
  filter(gene == "HBA2") %>% 
  column_to_rownames(var = "gene")
  
HBA2 <- as.data.frame(t(HBA2))

HBA2 <- HBA2 %>% 
  rownames_to_column(var = "sample")
  
HBA2 <- HBA2 %>% 
  left_join(covid, by = "sample")

HBA2 %>% 
  ggplot(aes(x = COVID, y = log2(HBA2), fill = COVID)) +
  geom_violin() +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.7, fill = "black") +
  scale_fill_manual(values = c("gray", "orange")) +
  theme_minimal() + 
  theme(legend.position = "bottom") +
  labs(x = "COVID Status", y = "log2 (HBA2 RPM)", title = "HBA2: Gene with highest negative logFC change on including lactate concentration in the model")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

# HBA2 %>%
#   ggplot(aes(x = sample, y = log2(HBA2), colour = COVID)) +
#   geom_point() +
#   scale_colour_manual(values = c("gray", "orange")) +
#   theme_minimal() +
#   theme(legend.position = "bottom") +
#   labs(x = "COVID Status", y = "log2 (HBA2 RPM)", title = "HBA2: Gene with highest negative logFC change on including lactate concentration in the model") +
#   facet_grid(~COVID)

a popular way of combining the above information - p-values and FCs is a volcano plot. Let’s now select all of the DEG that we got when we included lactate measurement in out model to generate a volcano plot.

volcanoplot(efit_lactate, coef = "COVIDyes", highlight=5, style = "p-value", names = rownames(efit_lactate$coefficients), hl.col="blue")

volcanoplot(efit_lactate, coef = "COVIDyes", highlight=5, style = "p-value", names = rownames(efit_lactate$coefficients), hl.col="blue", xlim=c(-500,500))

a = topTable(efit_lactate, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "p")

8.0 Enrichment Analysis

# BiocManager::install("biomaRt")

library(biomaRt)

listMarts() #gives us the list of databases available
## Ensembl site unresponsive, trying useast mirror
##                biomart                version
## 1 ENSEMBL_MART_ENSEMBL      Ensembl Genes 108
## 2   ENSEMBL_MART_MOUSE      Mouse strains 108
## 3     ENSEMBL_MART_SNP  Ensembl Variation 108
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 108
ensembl=useMart("ensembl")

## which all species are present from the ensembl database?
head(listDatasets(ensembl))
##                        dataset                           description
## 1 abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1)
## 2     acalliptera_gene_ensembl      Eastern happy genes (fAstCal1.2)
## 3   acarolinensis_gene_ensembl       Green anole genes (AnoCar2.0v2)
## 4    acchrysaetos_gene_ensembl       Golden eagle genes (bAquChr1.2)
## 5    acitrinellus_gene_ensembl        Midas cichlid genes (Midas_v5)
## 6    amelanoleuca_gene_ensembl       Giant panda genes (ASM200744v2)
##       version
## 1 ASM259213v1
## 2  fAstCal1.2
## 3 AnoCar2.0v2
## 4  bAquChr1.2
## 5    Midas_v5
## 6 ASM200744v2
## we'll make a variable selecting the Homo sapiens dataset
mart = useMart("ensembl", dataset="hsapiens_gene_ensembl")
## Ensembl site unresponsive, trying uswest mirror
## Using the DEGs we got from the lactate model
genes <- topTable(efit_lactate, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.01, n = Inf, sort.by = "logFC")
genes <- rownames(genes)

## We'll only use the top 200 genes, as the maximum number of queries biomaRt can take is 500
genes <- genes[1:200]

## we require the Entrz IDs for all functions after this step - so converting HGNC Symbols to Entrez IDs
hgnc_to_entrez <- getBM(attributes = c("hgnc_symbol", "entrezgene_id"), filters = "hgnc_symbol", values = genes, mart = mart)
## check all choices
filters <- listFilters(mart) #filters are the parameters you search by
attr <- listAttributes(mart) #attributes are the matching parameters you're wanting to search for

head(hgnc_to_entrez)
##   hgnc_symbol entrezgene_id
## 1        ACTB            60
## 2       ACTG1            71
## 3       ADAM8           101
## 4        ADAR           103
## 5        AIF1           199
## 6       ALDOA           226
## selecting attributes as the GO id, the GO term, the GO term definition, and the cell 
## compartment that GO term belongs to, searching by the filter/parameter HGNC symbol
go_terms <- getBM(attributes = c("hgnc_symbol", "go_id", "name_1006", "definition_1006", "namespace_1003"), filters = "hgnc_symbol", values = genes, mart = mart)

head(go_terms)
##   hgnc_symbol      go_id                  name_1006
## 1        ACTB GO:0005634                    nucleus
## 2        ACTB GO:0005737                  cytoplasm
## 3        ACTB GO:0016020                   membrane
## 4        ACTB GO:0032991 protein-containing complex
## 5        ACTB GO:0000166         nucleotide binding
## 6        ACTB GO:0005524                ATP binding
##                                                                                                                                                                                                                                                                                                                                          definition_1006
## 1 A membrane-bounded organelle of eukaryotic cells in which chromosomes are housed and replicated. In most cells, the nucleus contains all of the cell's chromosomes except the organellar chromosomes, and is the site of RNA synthesis and processing. In some species, or in specialized cell types, RNA metabolism or DNA replication may be absent.
## 2                                                                                                                                                                                                                                          The contents of a cell excluding the plasma membrane and nucleus, but including other subcellular structures.
## 3                                                                                                                                                                                                                                                    A lipid bilayer along with all the proteins and protein complexes embedded in it an attached to it.
## 4                                                                                                                                                      A stable assembly of two or more macromolecules, i.e. proteins, nucleic acids, carbohydrates or lipids, in which at least one component is a protein and the constituent parts function together.
## 5                                                                                                                                                                     Binding to a nucleotide, any compound consisting of a nucleoside that is esterified with (ortho)phosphate or an oligophosphate at any hydroxyl group on the ribose or deoxyribose.
## 6                                                                                                                                                                                                                                                      Binding to ATP, adenosine 5'-triphosphate, a universally important coenzyme and enzyme regulator.
##       namespace_1003
## 1 cellular_component
## 2 cellular_component
## 3 cellular_component
## 4 cellular_component
## 5 molecular_function
## 6 molecular_function
## deleting all empty rows
go_terms <- go_terms %>% 
  mutate_all(na_if,"")
go_terms <- na.omit(go_terms)

## counting the frequency of each GO term
go_plot <- go_terms %>% 
  dplyr::count(name_1006) %>% 
  dplyr::arrange(desc(n))

## we know that the total DEGs we selected were 200, so let's get the 
## percentage of how many of the genes were associated with a particular GO Term
head(go_plot)
##               name_1006   n
## 1       protein binding 169
## 2               cytosol 138
## 3              membrane 133
## 4             cytoplasm 131
## 5 extracellular exosome  93
## 6               nucleus  89
go_plot$total <- 200
go_plot <- go_plot[-1,]
go_plot <- go_plot %>% 
  mutate(perc = (n/total)*100) %>% 
  dplyr::arrange()

## for the first 20 GO Terms
go_plot[1:20,] %>%
  ggplot(aes(x = reorder(name_1006, -perc), y = perc)) +
  geom_bar(stat = "identity", width = 0.6) +
  coord_cartesian(y = c(0,100)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 20 GO Terms", subtitle = "For DEGs at adjpval <= 0.05", x = "GO Term", y = "Percentage of DEGs assoc. with GO Term")

## let's all add the cellular compartment to our plot
component <- go_terms %>% 
  dplyr::select(name_1006, namespace_1003) %>% 
  distinct(name_1006, .keep_all = TRUE)

head(component)
##                    name_1006     namespace_1003
## 1                    nucleus cellular_component
## 2                  cytoplasm cellular_component
## 3                   membrane cellular_component
## 4 protein-containing complex cellular_component
## 5         nucleotide binding molecular_function
## 6                ATP binding molecular_function
go_plot <- go_plot %>% 
  right_join(component, by = "name_1006")

head(go_plot)
##               name_1006   n total perc     namespace_1003
## 1               cytosol 138   200 69.0 cellular_component
## 2              membrane 133   200 66.5 cellular_component
## 3             cytoplasm 131   200 65.5 cellular_component
## 4 extracellular exosome  93   200 46.5 cellular_component
## 5               nucleus  89   200 44.5 cellular_component
## 6           RNA binding  88   200 44.0 molecular_function
go_plot[1:20,] %>% 
  ggplot(aes(x = reorder(name_1006, -perc), y = perc, fill = namespace_1003)) +
  geom_bar(stat = "identity", width = 0.6) +
  scale_fill_manual(values = c("maroon", "navy", "forest green")) +
  coord_cartesian(y = c(0,100)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom") +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  labs(title = "Top 20 GO Terms", subtitle = "For DEGs at adjpval <= 0.05", x = "GO Term", y = "Percentage of DEGs assoc. with GO Term")

Let’s look at the KEGG pathways associated with our DEGs

# BiocManager::install("clusterProfiler")
library(clusterProfiler, quietly = TRUE)
## 
## clusterProfiler v4.4.4  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:biomaRt':
## 
##     select
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following object is masked from 'package:stats':
## 
##     filter
k <- enrichKEGG(gene = genes, organism = "hsa")
## Reading KEGG annotation online:
## Reading KEGG annotation online:
## --> No gene can be mapped....
## --> Expected input gene ID: 27349,23205,5373,80201,7366,729920
## --> return NULL...
## when we input our genes as HGNC IDs, the function doesn't work. We'll use our Entrez IDs that we have

head(hgnc_to_entrez)
##   hgnc_symbol entrezgene_id
## 1        ACTB            60
## 2       ACTG1            71
## 3       ADAM8           101
## 4        ADAR           103
## 5        AIF1           199
## 6       ALDOA           226
k <- enrichKEGG(gene = hgnc_to_entrez$entrezgene_id, organism = "hsa")
head(k)
##                ID                    Description GeneRatio  BgRatio
## hsa03010 hsa03010                       Ribosome    76/149 158/8191
## hsa05171 hsa05171 Coronavirus disease - COVID-19    78/149 232/8191
## hsa04145 hsa04145                      Phagosome    11/149 152/8191
## hsa05140 hsa05140                  Leishmaniasis     7/149  77/8191
## hsa05416 hsa05416              Viral myocarditis     6/149  60/8191
## hsa05152 hsa05152                   Tuberculosis    10/149 180/8191
##                pvalue     p.adjust       qvalue
## hsa03010 4.621372e-97 8.364683e-95 7.588779e-95
## hsa05171 1.147343e-84 1.038345e-82 9.420288e-83
## hsa04145 9.586089e-05 5.783607e-03 5.247122e-03
## hsa05140 4.747203e-04 2.148109e-02 1.948852e-02
## hsa05416 7.302291e-04 2.643429e-02 2.398226e-02
## hsa05152 1.599476e-03 4.825087e-02 4.377515e-02
##                                                                                                                                                                                                                                                                                                                                                                                                          geneID
## hsa03010          2197/6134/4736/6135/6136/6137/23521/9045/6138/6139/6141/6142/6143/6144/6146/9349/6147/6152/6154/6155/6157/6158/6159/6122/6156/6160/6161/6164/11224/6165/6173/6167/6168/6169/6170/6124/6171/6125/6128/6129/6130/6132/6133/6175/6176/6181/6204/6205/6206/6208/6209/6210/6217/6218/6222/6223/6187/6224/6228/6229/6230/6231/6232/6233/6234/6235/6188/6189/6191/6193/6194/6201/6202/6203/3921/7311
## hsa05171 103/2197/6134/4736/6135/6136/6137/23521/9045/6138/6139/6141/6142/6143/6144/6146/9349/6147/6152/6154/6155/6157/6158/6159/6122/6156/6160/6161/6164/11224/6165/6173/6167/6168/6169/6170/6124/6171/6125/6128/6129/6130/6132/6133/6175/6176/6181/6204/6205/6206/6208/6209/6210/6217/6218/6222/6223/6187/6224/6228/6229/6230/6231/6232/6233/6234/6235/6188/6189/6191/6193/6194/6201/6202/6203/3921/6772/7311
## hsa04145                                                                                                                                                                                                                                                                                                                                                   60/71/9114/11151/1535/3113/3115/3123/3689/10312/7846
## hsa05140                                                                                                                                                                                                                                                                                                                                                                     1535/1915/3113/3115/3123/3689/6772
## hsa05416                                                                                                                                                                                                                                                                                                                                                                              60/71/3113/3115/3123/3689
## hsa05152                                                                                                                                                                                                                                                                                                                                                     9114/972/11151/3113/3115/3123/3689/4046/6772/10312
##          Count
## hsa03010    76
## hsa05171    78
## hsa04145    11
## hsa05140     7
## hsa05416     6
## hsa05152    10
kegg_res = k@result